{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "import json\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "selected = [\n",
    "    '/home/husein/ssd3/instructions/synthetic-codealpaca-v1-chatgpt4.jsonl',\n",
    "    '/home/husein/ssd3/instructions/synthetic-glaive_coder_raw_text.jsonl',\n",
    "    '/home/husein/ssd3/instructions/synthetic-oss_instruct-decontaminated.jsonl'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Substrings checked by `found_word`; callers skip any text containing one of them.\n",
    "rejected_words = [\n",
    "    'kebutuhan',\n",
    "    'berbeda',\n",
    "    'bahwa',\n",
    "    'nomor',\n",
    "    'RMXX,XXX',\n",
    "    'kompleksitas',\n",
    "    'listrik',\n",
    "    'jawaban',\n",
    "    'teknis',\n",
    "    'berkualitas',\n",
    "    'mencoba',\n",
    "    'kampanye',\n",
    "    'komunitas',\n",
    "    'stabilitas',\n",
    "    'Stabilitas',\n",
    "    'metode',\n",
    "    'pria',\n",
    "    'butuh',\n",
    "    'jadwal',\n",
    "    'kasus',\n",
    "    'otomatis',\n",
    "    'populer',\n",
    "    'bisnis',\n",
    "    'probabilitas',\n",
    "    'rusak',\n",
    "    'kapasitas',\n",
    "    'rutinitas',\n",
    "    'pertama-tama',\n",
    "    'bagian',\n",
    "    'fungsionalitas',\n",
    "]\n",
    "\n",
    "# Every text that matched one of the substrings is collected here for later inspection.\n",
    "indons = []\n",
    "\n",
    "\n",
    "def found_word(s, words):\n",
    "    \"\"\"Tell whether any entry of `words` occurs in `s` as a substring.\n",
    "\n",
    "    When there is a match, `s` is appended once to the module-level list\n",
    "    `indons` and True is returned; otherwise False is returned and\n",
    "    `indons` is left untouched.\n",
    "    \"\"\"\n",
    "    matched = any(candidate in s for candidate in words)\n",
    "    if matched:\n",
    "        indons.append(s)\n",
    "    return matched"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def replace_code(s):\n",
    "    \"\"\"Rewrite the spelling 'kode' (and the listed derived forms) to 'kod' in `s`.\n",
    "\n",
    "    The substitutions are plain, case-sensitive substring replacements applied\n",
    "    in the order given. The rules ' kode' -> ' kod' and ' Kode' -> ' Kod' also\n",
    "    hit the word 'kodeks' / 'Kodeks', so the damaged forms are repaired at the\n",
    "    very end, after every other rule has run.\n",
    "\n",
    "    Args:\n",
    "        s: text to normalise.\n",
    "\n",
    "    Returns:\n",
    "        The text with all substitutions applied.\n",
    "    \"\"\"\n",
    "    substitutions = (\n",
    "        (' Kode ', ' Kod '),\n",
    "        (' kode ', ' kod '),\n",
    "        ('Kode ', 'Kod '),\n",
    "        ('kodenya', 'kodnya'),\n",
    "        ('Kodenya', 'Kodnya'),\n",
    "        ('opkode', 'opkod'),\n",
    "        (' kode', ' kod'),\n",
    "        ('dikodekan', 'dikodkan'),\n",
    "        ('kode ', 'kod '),\n",
    "        (' Kode', ' Kod'),\n",
    "        # Repairs go last so that no later rule can break the word again;\n",
    "        # the capitalised form was previously left as 'Kodks'.\n",
    "        ('kodks', 'kodeks'),\n",
    "        ('Kodks', 'Kodeks'),\n",
    "    )\n",
    "    for old, new in substitutions:\n",
    "        s = s.replace(old, new)\n",
    "    return s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "32"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "english_news = glob('/home/husein/ssd3/google-translate-english-news/*.requested')\n",
    "len(english_news)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "39"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "english_texts = glob('/home/husein/ssd3/google-translate-english-texts/*.requested')\n",
    "len(english_texts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "22"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "malay_news = glob('/home/husein/ssd3/google-translate-malay-news/*.requested')\n",
    "len(malay_news)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hansard = glob('/home/husein/ssd3/google-translate-malaysia-hansard/*.requested')\n",
    "len(hansard)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "ens = [english_news, english_texts]\n",
    "mss = [malay_news, hansard]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def reject(k):\n",
    "    \"\"\"Screen a text for phrases that disqualify it.\n",
    "\n",
    "    Despite the name, a truthy result means the text is acceptable:\n",
    "    returns None when `k` contains any of the blocked phrases and True\n",
    "    otherwise. Callers test it as `if not reject(text): continue`.\n",
    "    \"\"\"\n",
    "    blocked_phrases = ('return JSON', 'AI language model', 'model bahasa AI')\n",
    "    if any(phrase in k for phrase in blocked_phrases):\n",
    "        return None\n",
    "    return True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start train.json afresh (mode 'w'); the following cells append to it.\n",
    "with open('train.json', 'w') as fopen_jsonl:\n",
    "    for path in selected:\n",
    "        with open(path) as fopen:\n",
    "            for line in fopen:\n",
    "                row = json.loads(line)\n",
    "                en, ms = row['instruction_en'], row['instruction_ms']\n",
    "\n",
    "                # Skip rows with an empty side or a blocked phrase on either side.\n",
    "                if not (en and ms and reject(en) and reject(ms)):\n",
    "                    continue\n",
    "\n",
    "                ms = replace_code(ms)\n",
    "                if found_word(ms, rejected_words):\n",
    "                    continue\n",
    "\n",
    "                # Both sides must be non-empty and shorter than 1800 whitespace-separated tokens.\n",
    "                if not all(len(text) and len(text.split()) < 1800 for text in (en, ms)):\n",
    "                    continue\n",
    "\n",
    "                # Each accepted row is emitted once per translation direction.\n",
    "                for src, tgt, prefix in (\n",
    "                    (ms, en, 'terjemah ke Inggeris: '),\n",
    "                    (en, ms, 'terjemah ke Melayu: '),\n",
    "                ):\n",
    "                    d = {\"translation\": {\"src\": src, \"tgt\": tgt, 'prefix': prefix}}\n",
    "                    fopen_jsonl.write(f'{json.dumps(d)}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "439509it [00:44, 9980.37it/s] \n"
     ]
    }
   ],
   "source": [
    "# Append the gpt4all prompt/response pairs and their translations to train.json.\n",
    "with open('train.json', 'a') as fopen_jsonl:\n",
    "    with open('/home/husein/ssd3/gpt4all-1.3/translated-gpt4all-filtered-noncode.jsonl') as fopen:\n",
    "        for l in tqdm(fopen):\n",
    "            l = json.loads(l)\n",
    "            # l['src']['prompt'] is paired with l['r'][0], l['src']['response'] with l['r'][1].\n",
    "            for field, index in (('prompt', 0), ('response', 1)):\n",
    "                try:\n",
    "                    en = l['src'][field]\n",
    "                    ms = l['r'][index]['result']\n",
    "\n",
    "                    # A filtered prompt drops the whole record, response included\n",
    "                    # (same effect as the `continue` this loop replaces).\n",
    "                    if not reject(en) or not reject(ms):\n",
    "                        break\n",
    "\n",
    "                    ms = replace_code(ms)\n",
    "\n",
    "                    if found_word(ms, rejected_words):\n",
    "                        break\n",
    "\n",
    "                    if len(en) and len(en.split()) < 1800 and len(ms) and len(ms.split()) < 1800:\n",
    "                        d = {\"translation\": {\"src\": ms, \"tgt\": en, 'prefix': 'terjemah ke Inggeris: '}}\n",
    "                        fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "\n",
    "                        d = {\"translation\": {\"src\": en, \"tgt\": ms, 'prefix': 'terjemah ke Melayu: '}}\n",
    "                        fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "\n",
    "                except (KeyError, IndexError, TypeError, AttributeError):\n",
    "                    # Missing or malformed field: skip just this pair and try the next one.\n",
    "                    # Anything else (disk errors, KeyboardInterrupt) is allowed to propagate\n",
    "                    # instead of being silently swallowed.\n",
    "                    continue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2188918 train.json\r\n"
     ]
    }
   ],
   "source": [
    "!wc -l train.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2496675it [00:19, 128751.77it/s]\n",
      "1683631it [00:09, 171204.35it/s]\n",
      "958405it [00:10, 92013.37it/s] \n"
     ]
    }
   ],
   "source": [
    "parallel_files = [\n",
    "    '/home/husein/ssd3/translation/processed-ms-id.jsonl',\n",
    "    '/home/husein/ssd3/translation/processed-ms-jw.jsonl',\n",
    "]\n",
    "\n",
    "# For these files only one direction is written: `right` is the source and `left` the target.\n",
    "with open('train.json', 'a') as fopen_jsonl:\n",
    "    for path in parallel_files:\n",
    "        with open(path) as fopen:\n",
    "            for line in tqdm(fopen):\n",
    "                row = json.loads(line)\n",
    "                left, right = row['left'], row['right']\n",
    "                if all(len(text) and len(text.split()) < 1800 for text in (left, right)):\n",
    "                    d = {\"translation\": {\"src\": right, \"tgt\": left, 'prefix': 'terjemah ke Melayu: '}}\n",
    "                    fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "\n",
    "# Here the `google` field is the target for two sources in turn: `left`, then `chatgpt`.\n",
    "with open('train.json', 'a') as fopen_jsonl:\n",
    "    with open('/home/husein/ssd3/translation/process-bjn-Latn.jsonl') as fopen:\n",
    "        for line in tqdm(fopen):\n",
    "            row = json.loads(line)\n",
    "            left, chatgpt, google = row['left'], row['chatgpt'], row['google']\n",
    "\n",
    "            if not (reject(chatgpt) and reject(google)):\n",
    "                continue\n",
    "\n",
    "            for source_text in (left, chatgpt):\n",
    "                if all(len(text) and len(text.split()) < 1800 for text in (source_text, google)):\n",
    "                    d = {\"translation\": {\"src\": source_text, \"tgt\": google, 'prefix': 'terjemah ke Melayu: '}}\n",
    "                    fopen_jsonl.write(f'{json.dumps(d)}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8286032 train.json\r\n"
     ]
    }
   ],
   "source": [
    "!wc -l train.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "75777it [00:10, 7498.74it/s] \n"
     ]
    }
   ],
   "source": [
    "with open('train.json', 'a') as fopen_jsonl:\n",
    "    with open('gpt4all-code.jsonl') as fopen:\n",
    "        for line in tqdm(fopen):\n",
    "            try:\n",
    "                row = json.loads(line)\n",
    "                # The prompt pair is handled before the response pair; a prompt that hits a\n",
    "                # rejected word also drops the response of the same row.\n",
    "                for field in ('prompt', 'response'):\n",
    "                    left, right = row[field][0], row[field][1]\n",
    "\n",
    "                    right = replace_code(right)\n",
    "                    if found_word(right, rejected_words):\n",
    "                        break\n",
    "\n",
    "                    # No length cap in this cell: only empty sides are skipped.\n",
    "                    if not (len(left) and len(right)):\n",
    "                        continue\n",
    "\n",
    "                    for src, tgt, prefix in (\n",
    "                        (left, right, 'terjemah ke Melayu: '),\n",
    "                        (right, left, 'terjemah ke Inggeris: '),\n",
    "                    ):\n",
    "                        d = {\"translation\": {\"src\": src, \"tgt\": tgt, 'prefix': prefix}}\n",
    "                        fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "            except Exception as e:\n",
    "                print(e)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8555444 train.json\r\n"
     ]
    }
   ],
   "source": [
    "!wc -l train.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "99999it [00:00, 304843.91it/s]\n",
      "100000it [00:00, 548406.22it/s]\n",
      "100000it [00:00, 540987.61it/s]\n",
      "18837it [00:00, 529477.51it/s]\n",
      "100000it [00:00, 516588.89it/s]\n",
      "100000it [00:00, 545894.50it/s]\n",
      "100000it [00:00, 545702.74it/s]\n",
      "100000it [00:00, 539987.46it/s]\n",
      "100000it [00:00, 534729.94it/s]\n",
      "100000it [00:00, 546295.51it/s]\n",
      "100000it [00:00, 550466.83it/s]\n",
      "100000it [00:00, 539278.60it/s]\n",
      "100000it [00:00, 524173.34it/s]\n",
      "100000it [00:00, 384163.43it/s]\n",
      "100000it [00:00, 481753.94it/s]\n",
      "100000it [00:00, 438353.23it/s]\n",
      "100000it [00:00, 538786.06it/s]\n",
      "100000it [00:00, 539827.61it/s]\n",
      "100000it [00:00, 545477.77it/s]\n",
      "100000it [00:00, 537957.50it/s]\n",
      "100000it [00:00, 541690.48it/s]\n",
      "100000it [00:00, 542370.64it/s]\n",
      "100000it [00:00, 365940.83it/s]\n",
      "100000it [00:00, 539326.44it/s]\n",
      "100000it [00:00, 264020.03it/s]\n",
      "100000it [00:00, 518760.61it/s]\n",
      "100000it [00:00, 532502.52it/s]\n",
      "100000it [00:00, 541452.03it/s]\n",
      "100000it [00:00, 429112.03it/s]\n",
      "100000it [00:00, 373072.75it/s]\n",
      "100000it [00:00, 529669.44it/s]\n",
      "100000it [00:00, 529913.03it/s]\n",
      "99999it [00:00, 422630.16it/s]\n",
      "99996it [00:00, 422112.19it/s]\n",
      "99995it [00:00, 414495.17it/s]\n",
      "99997it [00:00, 412367.57it/s]\n",
      "99999it [00:00, 424044.47it/s]\n",
      "99995it [00:00, 418231.69it/s]\n",
      "99997it [00:00, 428548.91it/s]\n",
      "99996it [00:00, 417028.22it/s]\n",
      "99996it [00:00, 429845.10it/s]\n",
      "99997it [00:00, 427352.47it/s]\n",
      "99994it [00:00, 418970.34it/s]\n",
      "99997it [00:00, 266696.48it/s]\n",
      "99992it [00:00, 209885.43it/s]\n",
      "99993it [00:00, 270085.58it/s]\n",
      "99993it [00:00, 423696.47it/s]\n",
      "7616it [00:00, 416010.98it/s]\n",
      "99996it [00:00, 428312.24it/s]\n",
      "99993it [00:00, 424972.43it/s]\n",
      "99996it [00:00, 323788.51it/s]\n",
      "99999it [00:00, 432781.85it/s]\n",
      "99998it [00:00, 423168.75it/s]\n",
      "99988it [00:00, 420970.07it/s]\n",
      "99992it [00:00, 430257.84it/s]\n",
      "99994it [00:00, 433366.57it/s]\n",
      "99995it [00:00, 430149.40it/s]\n",
      "99996it [00:00, 431100.76it/s]\n",
      "99990it [00:00, 418980.37it/s]\n",
      "99994it [00:00, 402221.14it/s]\n",
      "99998it [00:00, 369259.58it/s]\n",
      "99991it [00:00, 423354.83it/s]\n",
      "99995it [00:00, 425544.94it/s]\n",
      "99996it [00:00, 215132.56it/s]\n",
      "99994it [00:00, 292507.97it/s]\n",
      "99992it [00:00, 339044.35it/s]\n",
      "99993it [00:00, 224809.64it/s]\n",
      "99992it [00:00, 273990.05it/s]\n",
      "99990it [00:00, 304679.62it/s]\n",
      "99994it [00:00, 428048.82it/s]\n",
      "99993it [00:00, 432798.31it/s]\n",
      "99897it [00:00, 283239.83it/s]\n",
      "100000it [00:00, 277166.79it/s]\n",
      "99997it [00:00, 280173.35it/s]\n",
      "100000it [00:00, 289654.08it/s]\n",
      "99995it [00:00, 303793.30it/s]\n",
      "99999it [00:00, 285170.11it/s]\n",
      "99998it [00:00, 190994.50it/s]\n",
      "100000it [00:00, 246271.78it/s]\n",
      "87609it [00:00, 264930.44it/s]\n",
      "100000it [00:00, 267872.28it/s]\n",
      "100000it [00:00, 286089.71it/s]\n",
      "100000it [00:00, 298818.14it/s]\n",
      "100000it [00:00, 277034.98it/s]\n",
      "100000it [00:00, 147229.16it/s]\n",
      "100000it [00:00, 161965.06it/s]\n",
      "99989it [00:00, 129887.38it/s]\n",
      "100000it [00:00, 114523.44it/s]\n",
      "100000it [00:00, 170694.87it/s]\n",
      "99994it [00:00, 146974.65it/s]\n",
      "100000it [00:00, 162763.85it/s]\n",
      "99992it [00:00, 146985.51it/s]\n",
      "99994it [00:00, 143518.69it/s]\n",
      "27971it [00:02, 11671.80it/s]\n",
      "28150it [00:02, 11764.77it/s]\n",
      "28083it [00:02, 10271.34it/s]\n",
      "25223it [00:02, 9841.96it/s] \n",
      "28314it [00:02, 10314.43it/s]\n"
     ]
    }
   ],
   "source": [
    "import random\n",
    "\n",
    "# Dedicated, seeded generator so the subsampling below picks the same lines on every run.\n",
    "rng = random.Random(0)\n",
    "\n",
    "with open('train.json', 'a') as fopen_jsonl:\n",
    "    # Files listed in `ens`: `src` is used as the English side; about 20% of the lines are kept.\n",
    "    for fs in ens:\n",
    "        for f in fs:\n",
    "            with open(f) as fopen:\n",
    "                for l in tqdm(fopen):\n",
    "\n",
    "                    if rng.random() > 0.2:\n",
    "                        continue\n",
    "\n",
    "                    data = json.loads(l)\n",
    "                    en = data['src']\n",
    "                    ms = data['r']['result']\n",
    "\n",
    "                    if len(en) and len(en.split()) < 1800 and len(ms) and len(ms.split()) < 1800:\n",
    "                        d = {\"translation\": {\"src\": en, \"tgt\": ms, 'prefix': 'terjemah ke Melayu: '}}\n",
    "                        fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "\n",
    "                        d = {\"translation\": {\"src\": ms, \"tgt\": en, 'prefix': 'terjemah ke Inggeris: '}}\n",
    "                        fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "\n",
    "    # Files listed in `mss`: `src` is used as the Malay side. Paths containing\n",
    "    # 'malaysia-hansard' are kept in full, every other file at about 40%.\n",
    "    for fs in mss:\n",
    "        for f in fs:\n",
    "            with open(f) as fopen:\n",
    "                for l in tqdm(fopen):\n",
    "\n",
    "                    if 'malaysia-hansard' not in f and rng.random() > 0.4:\n",
    "                        continue\n",
    "\n",
    "                    data = json.loads(l)\n",
    "                    # `src` is either the raw string or a dict carrying it under 'cleaned'.\n",
    "                    src = data['src']\n",
    "                    ms = src['cleaned'] if isinstance(src, dict) else src\n",
    "                    en = data['r']['result']\n",
    "\n",
    "                    if len(en) and len(en.split()) < 1800 and len(ms) and len(ms.split()) < 1800:\n",
    "\n",
    "                        d = {\"translation\": {\"src\": en, \"tgt\": ms, 'prefix': 'terjemah ke Melayu: '}}\n",
    "                        fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "\n",
    "                        d = {\"translation\": {\"src\": ms, \"tgt\": en, 'prefix': 'terjemah ke Inggeris: '}}\n",
    "                        fopen_jsonl.write(f'{json.dumps(d)}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "13349226 train.json\r\n"
     ]
    }
   ],
   "source": [
    "!wc -l train.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "def found_word(s, words):\n",
    "    \"\"\"Return the first entry of `words` that occurs in `s`, or False if none does.\n",
    "\n",
    "    From this cell on, this definition replaces the earlier `found_word`;\n",
    "    unlike that one it returns the matching word and does not append to `indons`.\n",
    "    \"\"\"\n",
    "    return next((word for word in words if word in s), False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "!shuf train.json > shuffled-train.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\"translation\": {\"src\": \"Jumlah karya wanita pada akhir periode atau rata-rata jumlah karya wanita selama periode yang bersangkutan.\", \"tgt\": \"Bilangan kerja wanita pada akhir tempoh atau purata bilangan kerja wanita dalam tempoh tersebut.\", \"prefix\": \"terjemah ke Melayu: \"}}\r\n"
     ]
    }
   ],
   "source": [
    "!head -n 1 shuffled-train.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip3.8 install mosaicml-streaming"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/.local/lib/python3.8/site-packages/requests/__init__.py:102: RequestsDependencyWarning: urllib3 (1.26.15) or chardet (5.2.0)/charset_normalizer (2.0.7) doesn't match a supported version!\n",
      "  warnings.warn(\"urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported \"\n"
     ]
    }
   ],
   "source": [
    "from streaming import MDSWriter, LocalDataset\n",
    "\n",
    "columns = {\n",
    "    'src': 'str',\n",
    "    'tgt': 'str',\n",
    "    'prefix': 'str',\n",
    "}\n",
    "hashes = 'sha1', 'xxh64'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "!rm -rf mosaic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "13349226it [01:50, 121236.19it/s]\n"
     ]
    }
   ],
   "source": [
    "# Each line of shuffled-train.json becomes one sample: the dict stored under 'translation'.\n",
    "with MDSWriter(out='mosaic', columns=columns, compression=None, hashes=hashes) as out, open('shuffled-train.json') as fopen:\n",
    "    for line in tqdm(fopen):\n",
    "        sample = json.loads(line)['translation']\n",
    "        out.write(sample)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = LocalDataset('mosaic')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "13349226"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "15700"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(indons)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Terjemahan ke dalam Bahasa Melayu\n",
      "\n",
      "Melaksanakan kalkulator asas yang mengambil persamaan sebagai rentetan input, mengendalikan pelbagai operator (+, -, *, /) dan hasil output. Jika persamaan tidak sah atau menghasilkan kesalahan seperti pembahagian dengan sifar, kembalikan mesej kesalahan yang sesuai.\n",
      "\n",
      "Perhatikan kod Python yang rosak berikut. Perbaiki ini sehingga ia memenuhi spesifikasi yang dinyatakan di atas:\n",
      "\n",
      "```Python\n",
      "def calculate(equation: str):\n",
      "    return eval(equation)\n",
      "```\n",
      "\n",
      "Sebagai contoh, diberikan rentetan \"3 + 3 * 2\", skrip anda harus mengembalikan 9, bukan 12. Berikan kes ujian yang merangkumi setiap operator sekurang-kurangnya sekali.\n",
      "\n",
      "---\n",
      "\n",
      "Berikut adalah versi yang lebih baik daripada kod Python:\n",
      "\n",
      "```Python\n",
      "import re\n",
      "\n",
      "def calculate(equation: str):\n",
      "    # Periksa jika persamaan adalah sah\n",
      "    if not re.match(\"[0-9+*/ -]*$\", equation):\n",
      "        return \"ERROR: Invalid equation\"\n",
      "    \n",
      "    # Bahagikan persamaan dan nilai hasilnya\n",
      "    try:\n",
      "        return eval(''.join(re.split('([+*/ -])', equation)).strip())\n",
      "    except ZeroDivisionError:\n",
      "        return \"ERROR: Division by zero not allowed\"\n",
      "    except Exception as e:\n",
      "        return \"ERROR: \" + str(e)\n",
      "\n",
      "print(calculate('3 + 3 * 2'))  # Harus mengembalikan 9\n",
      "print(calculate('3 + 5 / 0'))  # Harus mengembalikan \"ERROR: Division by zero not allowed\"\n",
      "print(calculate('3 + 2 -- 8 * 4 / 2'))  # Harus mengembalikan 15\n",
      "print(calculate('3 * (2 + 2'))  # Harus mengembalikan \"ERROR: Invalid equation\"\n",
      "```\n",
      "\n",
      "Dalam kod di atas, kami pertama-tama melakukan pemeriksaan pada persamaan input. Jika persamaan mengandungi sebarang aksara yang bukan digit atau operator, kami mengembalikan mesej ralat. Kami kemudian mencuba untuk memisahkan persamaan menjadi komponen individunya, menilainya dan mengembalikan hasilnya. Perhatikan bahawa kami mengendalikan pembahagian dengan sifar dan persamaan tidak sah secara berasingan dengan mesej ralat yang berbeza.\n"
     ]
    }
   ],
   "source": [
    "# Print the first collected text that contains 'pertama-tama', if there is one.\n",
    "first_hit = next((text for text in indons if 'pertama-tama' in text), None)\n",
    "if first_hit is not None:\n",
    "    print(first_hit)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'prefix': 'terjemah ke Melayu: ',\n",
       " 'src': 'Ryosuke Kojima Ryosuke Kojima memulai karir profesionalnya pada tahun 2019 saat masih bersama Oita Trinita.',\n",
       " 'tgt': 'Ryosuke Kojima Ryosuke Kojima memulakan kerjaya profesionalnya pada tahun 2019 ketika bersama Oita Trinita.'}"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset[10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove the intermediate files now that the MDS copy exists. This notebook writes them as\n",
    "# train.json / shuffled-train.json; the previous *.jsonl names matched nothing.\n",
    "!rm -rf train.json shuffled-train.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8.8G\tmosaic\r\n"
     ]
    }
   ],
   "source": [
    "!du -hs mosaic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "!cp mosaic/* ~/ssd3/mosaic-standard-translation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
